library(tidyverse)
## ── Attaching packages ───────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Load data
load('01_parsed.Rdata')
25% of programs account for about 50% of (permanent) placements
ggplot(univ_df, aes(perm_placement_rank, frac_cum_perm_placements)) +
geom_step() +
scale_x_continuous(labels = scales::percent_format(),
name = 'PhD Programs') +
scale_y_continuous(labels = scales::percent_format(),
name = 'Permanent Placements')
## Warning: Removed 20 rows containing missing values (geom_path).
## NB Only permanent placements
hiring_network = individual_df %>%
filter(permanent) %>%
select(placing_univ_id, hiring_univ_id, everything()) %>%
graph_from_data_frame(directed = TRUE,
vertices = univ_df)
## 1 giant component contains almost all of the schools
components(hiring_network)$csize
## [1] 1 1 1 1 1 786 1 1 2 1 1 1 1 1 1 1 1
## [18] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [35] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [52] 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1
## [69] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [86] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [103] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [120] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [137] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [154] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [171] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [188] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [205] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [222] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [239] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [256] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [273] 1 1 1 1 1 1 1
Out- and in-centrality
set.seed(42)
V(hiring_network)$in_centrality = eigen_centrality(hiring_network,
directed = TRUE)$vector
graph.reverse <- function (graph) {
if (!is.directed(graph))
return(graph)
e <- get.data.frame(graph, what="edges")
## swap "from" & "to"
neworder <- 1:length(e)
neworder[1:2] <- c(2,1)
e <- e[neworder]
names(e) <- names(e)[neworder]
graph.data.frame(e, vertices = get.data.frame(graph, what="vertices"))
}
V(hiring_network)$out_centrality = hiring_network %>%
graph.reverse() %>%
eigen_centrality(directed = TRUE) %>%
.$vector
## Add centrality statistics to the university df
univ_df = univ_df %>%
mutate(out_centrality = V(hiring_network)[univ_df$univ_id]$out_centrality,
in_centrality = V(hiring_network)[univ_df$univ_id]$in_centrality)
There are two clear groups of centrality scores, with high scores (in the range of 10^-5 to 1) and low scores (10^-15 and smaller).
##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
ggplot(univ_df, aes(out_centrality)) +
geom_density() + geom_rug() +
scale_x_continuous(trans = 'log10')
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_density).
ggplot(univ_df, aes(in_centrality)) +
geom_density() + geom_rug() +
scale_x_continuous(trans = 'log10')
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing non-finite values (stat_density).
ggplot(univ_df, aes(out_centrality, in_centrality,
color = cluster,
text = univ_name)) +
geom_jitter() +
scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
High output is only modestly correlated w/ centrality. Programs like ND, CUNY, New School produce lots of PhDs, but they aren’t placed into the high-centrality departments.
ggplot(univ_df, aes(total_placements, log10(out_centrality)
)) +
geom_point(aes(text = univ_name))
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 913 rows containing missing values (geom_point).
plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
univ_df %>%
mutate(out_centrality_log = log10(out_centrality)) %>%
filter(!is.na(out_centrality_log) &
out_centrality > 0 &
!is.na(total_placements)) %>%
select(total_placements, out_centrality_log) %>%
cor()
## total_placements out_centrality_log
## total_placements 1.0000000 0.3614052
## out_centrality_log 0.3614052 1.0000000
ggplot(univ_df, aes(perm_placement_rank, log10(out_centrality))) +
geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).
ggplot(univ_df, aes(perm_placement_rate, log10(out_centrality),
text = univ_name)) +
geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).
plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## While individuals can move from low to high centrality in temporary positions, this never happens with permanent positions. However, this is expected from the way centrality is calculated.
# dataf %>%
# # filter(permanent) %>%
# left_join(select(univ_df, univ_id, out_centrality),
# by = c('placing_univ_id' = 'univ_id')) %>%
# left_join(select(univ_df, univ_id, out_centrality),
# by = c('hiring_univ_id' = 'univ_id')) %>%
# ggplot(aes(log10(out_centrality.x),
# log10(out_centrality.y))) +
# geom_point() +
# xlab('Placing University Centrality') +
# ylab('Hiring University Centrality') +
# stat_function(fun = function (x) x, linetype = 'dashed') +
# facet_grid(~ permanent)
## steps = 26 has low values for both entropy of cluster sizes (high delta H) and total # clusters
## but somewhere along the way this started producing >300 clusters?
## bc ~300 connected components
## Extract giant component
hiring_network_gc = hiring_network %>%
components %>%
.$csize %>%
{which(. == max(.))} %>%
{components(hiring_network)$membership == .} %>%
which() %>%
induced_subgraph(hiring_network, .)
# walk_len = rep(2:100, 1)
# # ## NB Takes a minute or so
# cluster_stats = walk_len %>%
# map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
# map(~ list(sizes = sizes(.x), length = length(.x))) %>%
# map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
# n_clusters = .x$length)) %>%
# mutate(walk_len = as.factor(walk_len),
# delta_H = log2(n_clusters) - H)
#
# ggplot(cluster_stats, aes(walk_len, delta_H)) +
# geom_point() +
# coord_flip()
# ggplot(cluster_stats, aes(walk_len, n_clusters)) +
# geom_point() +
# coord_flip()
# ggplot(cluster_stats, aes(n_clusters, delta_H)) +
# geom_text(aes(label = walk_len)) +
# scale_x_continuous()
communities = cluster_walktrap(hiring_network_gc, steps = 26)
V(hiring_network_gc)$community = membership(communities)
univ_df = univ_df %>%
left_join({hiring_network_gc %>%
as_data_frame(what = 'vertices') %>%
select(univ_id = name, community) %>%
mutate(community = as.character(community))})
## Joining, by = "univ_id"
There is no correlation between semantic clusters and topological communities.
univ_df %>%
filter(!is.na(community), !is.na(cluster)) %>%
select(community, cluster) %>%
table() %>%
chisq.test(simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: .
## X-squared = 219.96, df = NA, p-value = 0.2619
univ_df %>%
filter(!is.na(community), !is.na(cluster)) %>%
count(community, cluster) %>%
rename(cluster_n = n) %>%
group_by(community) %>%
mutate(community_tot = sum(cluster_n),
cluster_frac = cluster_n / community_tot,
H = sum(cluster_frac * log2(cluster_frac))) %>%
ggplot(aes(fct_reorder(community, H, .desc = TRUE),
cluster_n, fill = cluster)) +
geom_col() +
coord_flip() +
xlab('Topological communities') +
ylab('# of schools in community') +
scale_fill_brewer(palette = 'Set1',
name = 'Semantic\nclusters')
## Start w/ Oxford, and work upstream
## Only need to go 8 steps to get closure
1:25 %>%
map(~ make_ego_graph(hiring_network, order = .x,
nodes = '512', mode = 'in')) %>%
flatten() %>%
map(~ length(V(.x))) %>%
tibble(order = 1:length(.),
size = unlist(.)) %>%
ggplot(aes(order, size)) + geom_point()
elites = make_ego_graph(hiring_network, order = 8,
nodes = '512',
mode = 'in') %>%
.[[1]]
## How large is the elite community?
## 61 programs; 6% of all programs in the network;
## 39% of programs with at least 1 placement in the dataset
length(V(elites))
## [1] 61
length(V(elites)) / length(V(hiring_network))
## [1] 0.05700935
length(V(elites)) / sum(!is.na(univ_df$total_placements))
## [1] 0.388535
## What fraction of hires are within elites?
## 15% of all permanent hires; 7% of all hires
length(E(elites)) / length(E(hiring_network))
## [1] 0.1542219
length(E(elites)) / nrow(individual_df)
## [1] 0.07828551
# png(file = '02_elite_net.png',
# width = 11, height = 11, units = 'in', res = 400)
ggraph(elites) +
geom_node_label(aes(label = univ_name,
size = log10(out_centrality))) +
geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')),
alpha = .25,
spread = 5) +
scale_size_continuous(range = c(.5, 3)) +
theme_graph()
## Using `nicely` as default layout
# dev.off()
## "Elite" status = high centrality group
univ_df = univ_df %>%
mutate(elite = univ_id %in% V(elites)$name)
ggplot(univ_df, aes(elite, log10(out_centrality))) +
geom_jitter()
## "Elite" status and clusters
## Elites basically don't appear in clusters 2, 3, 7
univ_df %>%
filter(!is.na(cluster)) %>%
ggplot(aes(cluster, color = elite)) +
geom_point(stat = 'count') +
geom_line(aes(group = elite), stat = 'count')
## Elites are mostly confined to the 3 large communities
univ_df %>%
filter(!is.na(community)) %>%
ggplot(aes(as.integer(community), fill = elite)) +
geom_bar()
## What fraction of elite graduates end up in elite programs?
## 221 / (221 + 569) = 28% of those w/ permanent placements
individual_df %>%
filter(permanent) %>%
left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
select(elite.x, elite.y) %>%
table()
## elite.y
## elite.x FALSE TRUE
## FALSE 643 0
## TRUE 569 221
Median permanent placement rate for elite programs is 14 points higher than for non-elite programs. However, variation is also wide within each group; even among elite programs, median permanent placement rate is only 58%. This is also not yet controlling for graduation year, area, or demographics.
ggplot(univ_df, aes(elite, perm_placement_rate,
label = univ_name)) +
geom_boxplot(color = 'red') +
geom_jitter() +
scale_y_continuous(labels = scales::percent_format())
## Warning: Removed 913 rows containing non-finite values (stat_boxplot).
## Warning: Removed 913 rows containing missing values (geom_point).
plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Removed 913 rows containing non-finite values (stat_boxplot).
Coarser community structure
large_clusters = which(sizes(communities) > 100)
V(hiring_network_gc)$community_coarse = ifelse(
V(hiring_network_gc)$community %in% large_clusters,
V(hiring_network_gc)$community,
'other')
FR
hiring_network_gc %>%
induced_subgraph(which(degree(., mode = 'out') > 10)) %>%
ggraph() +
# geom_node_label(aes(label = univ_name,
# size = log10(1 + out_centrality),
# color = as.factor(community))) +
geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')),
spread = 5) +
geom_node_point(aes(size = log10(1 + out_centrality),
# color = as.factor(community))) +
color = community_coarse)) +
scale_color_brewer(palette = 'Set1', guide = FALSE) +
scale_size(range = c(2, 6)) +
theme_graph()
## Using `nicely` as default layout
Chord diagram
hiring_network_gc %>%
induced_subgraph(which(degree(., mode = 'out') > 1)) %>%
ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
geom_node_point(aes(size = log10(1 + out_centrality),
# color = as.factor(community))) +
color = community_coarse)) +
scale_color_brewer(palette = 'Set1', guide = FALSE) +
theme_graph()
## Save university-level data with network statistics
save(univ_df, file = '02_univ_net_stats.Rdata')